This Markdown file is the first part of this analysis.
I use a unique dataset that contains information on 47,006 Airbnb listings from seven major German cities, namely Berlin, Munich, Hamburg, Cologne, Dresden, Stuttgart and Frankfurt am Main. Listings were gathered directly from Airbnb’s website in September 2017 using a custom web scraper. The dataset includes all publicly available information for a listing, including but not limited to prices, accommodation features, reviews and host details.
# Report the size of the raw listings table, then dump its structure.
print(paste0("Number of rows: ", nrow(rooms)))
## [1] "Number of rows: 47006"
print(paste0("Number of columns: ", ncol(rooms)))
## [1] "Number of columns: 62"
str(rooms)
## Classes 'tbl_df', 'tbl' and 'data.frame': 47006 obs. of 62 variables:
## $ room_id : int 19117409 5728058 19954984 9918551 13836114 20355318 18732461 12021779 18019626 20121368 ...
## $ host_id : int 133588182 333588 140968262 50992051 81617924 80225160 49157795 7901771 2307050 20759906 ...
## $ room_type : chr "Entire home/apt" "Entire home/apt" "Entire home/apt" "Entire home/apt" ...
## $ country : chr "Deutschland" "Deutschland" "Deutschland" "Deutschland" ...
## $ city : chr "Hamburg" "Hamburg" "München" "Schönefeld" ...
## $ neighborhood : chr NA NA NA NA ...
## $ address : chr "Othmarschen, Hamburg" "Neustadt, Hamburg" "Schwabing - West, München" "Schönefeld" ...
## $ price : int 129 116 91 43 61 49 120 120 145 91 ...
## $ nightly_price : int 129 116 91 43 61 49 120 120 145 91 ...
## $ reviews : int 3 24 10 0 13 1 10 11 4 1 ...
## $ accommodates : int 2 2 6 1 2 2 5 6 5 4 ...
## $ bathrooms : int 1 1 1 1 1 1 1 1 2 1 ...
## $ bedrooms : int 1 1 2 0 1 1 3 2 3 1 ...
## $ bed_type : chr "Real Bed" "Real Bed" "Real Bed" "Real Bed" ...
## $ minstay : int 2 3 2 3 1 2 2 2 6 2 ...
## $ last_modified : POSIXct, format: "2017-09-27 08:47:10" "2017-09-27 08:47:27" ...
## $ latitude : num 53.6 53.6 48.2 52.4 53.6 ...
## $ longitude : num 9.9 9.98 11.56 13.44 9.98 ...
## $ survey_id : int 7 7 2 1 7 3 7 2 1 2 ...
## $ location : chr NA NA NA NA ...
## $ coworker_hosted : chr NA NA NA NA ...
## $ extra_host_languages : chr "{en}" "{en}" "{en}" "{en,fr}" ...
## $ name : chr "Komfortable Erdgeschosswohnung mit Südterrasse." "Cozy city apartment - very central" "EmiLi - Helle, gemütliche Wohnung in bester Lage" "Einliegerwohnung auf dem Mauerweg" ...
## $ property_type : chr "Wohnung" "Wohnung" "Wohnung" "Bed & Breakfast" ...
## $ currency : chr "EUR" "EUR" "EUR" "EUR" ...
## $ rate_type : chr "nightly" "nightly" "nightly" "nightly" ...
## $ overall_satisfaction : chr "100" "96" "100" NA ...
## $ cleanliness_satisfaction : int 10 10 10 NA 10 10 10 9 10 8 ...
## $ communication_satisfaction: int 10 10 10 NA 10 10 10 9 10 6 ...
## $ location_satisfaction : int 10 10 10 NA 10 10 10 9 10 8 ...
## $ accuracy_satisfaction : int 9 10 10 NA 10 10 10 9 10 10 ...
## $ checkin_satisfaction : int 10 10 10 NA 10 10 10 10 10 6 ...
## $ value_satisfaction : chr "10" "10" "10" NA ...
## $ amenities : chr "{128,1,129,4,8,9,21,91,92,93,30,94,31,95,96,33,98,35,99,100,101,40,41,44,110,111,112,113,50,115,116,120,57,121,61,127}" "{1,49,50,35,8,40,28,44,45,30,46}" "{33,129,35,4,38,8,40,73,42,44,45,46,47,28,61,30}" "{33,34,35,4,37,38,39,8,40,9,41,44,45,46,47,16,49,28,30,31}" ...
## $ cancel_policy : chr "4" "5" "3" "3" ...
## $ instant_book : chr "false" "false" "true" "false" ...
## $ response_time : chr "51118" "1000" "1" "28566" ...
## $ response_rate : num 1 1 1 1 1 0.5 1 1 1 1 ...
## $ friend_count : int 0 0 0 0 0 0 0 0 0 0 ...
## $ wishlist_count : int 14 90 27 17 26 0 43 73 20 4 ...
## $ pic_count : chr "12" "4" "7" "5" ...
## $ superhost : chr "false" "false" "false" "false" ...
## $ description_language : chr "de" "de" "de" "de" ...
## $ hostname : chr "Michael" "Nana" "Lina & Emily" "Liliana" ...
## $ rule_children : chr "true" "false" "true" "true" ...
## $ rule_infants : chr "false" "false" "false" "true" ...
## $ rule_pets : chr "false" "false" "false" "false" ...
## $ rule_smoking : chr "false" "false" "false" "false" ...
## $ rule_events : chr "false" "false" "false" "false" ...
## $ hostprofilepic : chr "https://a0.muscache.com/im/pictures/7e75a61b-5240-4867-b496-f7efdb564053.jpg?aki_policy=profile_x_medium" "https://a0.muscache.com/im/users/333588/profile_pic/1406487683/original.jpg?aki_policy=profile_x_medium" "https://a0.muscache.com/im/pictures/02b39cd9-1fd4-498e-b830-203f11919ee2.jpg?aki_policy=profile_x_medium" "https://a0.muscache.com/im/pictures/46deaa24-5700-45ed-b0ee-7a81b552da7f.jpg?aki_policy=profile_x_medium" ...
## $ cleaning_fee : chr "20" NA NA NA ...
## $ security_deposit : chr NA NA NA NA ...
## $ last_review : POSIXct, format: "2017-09-09 13:37:58" "2017-06-18 11:33:06" ...
## $ positive_reviews : POSIXct, format: NA NA ...
## $ negative_reviews : Date, format: NA NA ...
## $ last_cal_update : chr "2017-06-22" "2017-09-18" "2017-09-04" "2017-09-20" ...
## $ member_since : chr "Juni 2017" "Januar 2011" "Juli 2017" "Dezember 2015" ...
## $ host_verified : chr "TRUE" "TRUE" "FALSE" "FALSE" ...
## $ deleted : chr "0" "0" "0" "0" ...
## $ filled : chr "TRUE" "TRUE" "TRUE" "TRUE" ...
## $ description : chr "Die 80 qm große Wohnung ist im Erdgeschoß gelegen und sehr gut ausgestattet. Es gibt eine moderne Küche mit Ess"| __truncated__ "Bright, quiet, fully furnished, in the middle of Hamburg – great central suburb „Neustadt“. Fully equipped + li"| __truncated__ "Super schöne, sehr helle Wohnung. Stilvoll und mit viel Liebe eingerichtet. In top Lage! Karstadt, Rewe, Lidl,"| __truncated__ "- sehr ruhige Lage im Süden Berlins; 150 m zum Bus - besteht aus einem Zimmer (23,10 qm) mit integrierter Küche"| __truncated__ ...
## $ base_price : chr NA NA NA NA ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 62
## .. ..$ room_id : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ host_id : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ room_type : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ country : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ city : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ neighborhood : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ address : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ price : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ nightly_price : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ reviews : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ accommodates : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ bathrooms : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ bedrooms : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ bed_type : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ minstay : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ last_modified :List of 1
## .. .. ..$ format: chr ""
## .. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
## .. ..$ latitude : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ longitude : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ survey_id : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ location : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ coworker_hosted : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ extra_host_languages : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ name : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ property_type : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ currency : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ rate_type : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ overall_satisfaction : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ cleanliness_satisfaction : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ communication_satisfaction: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ location_satisfaction : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ accuracy_satisfaction : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ checkin_satisfaction : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ value_satisfaction : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ amenities : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ cancel_policy : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ instant_book : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ response_time : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ response_rate : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ friend_count : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ wishlist_count : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ pic_count : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ superhost : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ description_language : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ hostname : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ rule_children : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ rule_infants : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ rule_pets : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ rule_smoking : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ rule_events : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ hostprofilepic : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ cleaning_fee : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ security_deposit : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ last_review :List of 1
## .. .. ..$ format: chr ""
## .. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
## .. ..$ positive_reviews :List of 1
## .. .. ..$ format: chr ""
## .. .. ..- attr(*, "class")= chr "collector_datetime" "collector"
## .. ..$ negative_reviews :List of 1
## .. .. ..$ format: chr ""
## .. .. ..- attr(*, "class")= chr "collector_date" "collector"
## .. ..$ last_cal_update : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ member_since : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ host_verified : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ deleted : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ filled : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ description : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ base_price : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Parse the overall rating and the picture count (both read in as character
# columns) into numbers.
rooms <- rooms %>%
  mutate(
    overall_satisfaction = as.numeric(overall_satisfaction),
    pic_count = as.numeric(pic_count)
  )

# Drop listings that have no overall rating.
rooms <- rooms %>%
  filter(!is.na(overall_satisfaction))

Keep listings from the following cities: Hamburg, München, Berlin, Köln, Frankfurt am Main, Dresden, Stuttgart.
## Clean-up helper for free-text city names

#' Collapse city strings that mention a target city onto that city's name.
#'
#' @param x A single city name to look for.
#' @param city Character vector of raw city strings.
#' @return `city` with every element that contains `x` replaced by `x`;
#'   all other elements (including `NA`) are returned unchanged.
create_city <- function(x, city) {
  # fixed = TRUE: treat the city name as a literal substring, not as a regular
  # expression, so characters such as "." or "(" in a name cannot mis-match.
  ifelse(grepl(x, city, fixed = TRUE), x, city)
}

# Canonical names of the seven cities kept in the analysis.
city_list <- c(
  "Hamburg", "München", "Berlin", "Frankfurt", "Köln", "Stuttgart", "Dresden"
)
# Fold every target city over the raw city column: each pass rewrites the
# entries that mention that city to its canonical name.
rooms$city <- Reduce(
  function(cleaned, target) create_city(target, cleaned),
  city_list,
  rooms$city
)

# Keep only listings whose cleaned city is one of the target cities.
rooms <- rooms %>%
  filter(city %in% city_list)
# Listings per city; bar order comes from reorder(city, n, desc).
rooms %>%
  count(city) %>%
  ggplot(aes(x = reorder(city, n, desc), y = n)) +
  geom_col(fill = col[3], alpha = 0.8) +
  labs(x = "", y = "", title = "Count")

# Listings per property type, drawn as horizontal bars.
rooms %>%
  count(property_type) %>%
  ggplot(aes(x = reorder(property_type, n), y = n)) +
  geom_col(fill = col[3], alpha = 0.8) +
  labs(x = "", y = "", title = "Property Types") +
  coord_flip()

To keep things simple, I will just keep listings of property type “Wohnung” (apartment)
# Keep apartments only.
rooms <- rooms %>%
  filter(property_type == "Wohnung")

# Listings per room type.
ggplot(rooms, aes(room_type)) +
  geom_bar(fill = col[3], alpha = 0.8) +
  labs(x = "", y = "")

# Price by city; outlier points are drawn with size 0.
ggplot(rooms, aes(city, price)) +
  geom_boxplot(outlier.size = 0)

Apparently, there are some outliers. After checking the respective listings, I decided to exclude them.
# Keep only listings priced below 1500.
rooms <- rooms %>%
  filter(price < 1500)

# Bin prices into unit-wide intervals up to 500 plus one overflow bin above.
rooms$price.cut <- cut(rooms$price, c(seq(0, 500, 1), Inf))

# Price distribution per city. The x axis is the bin index of price.cut.
# Breaks are given explicitly so that they always line up with the six manual
# labels; ggplot2 raises an error when breaks and labels differ in length.
rooms %>%
  ggplot(aes(as.numeric(price.cut), factor(city))) +
  geom_density_ridges(
    scale = 5,
    fill = col[3],
    alpha = 0.7,
    color = "white"
  ) +
  theme_ridges() +
  scale_x_continuous(
    expand = c(0, 0),
    breaks = seq(0, 500, 100),
    labels = c(seq(0, 400, 100), ">500")
  ) +
  labs(y = "", x = "Price")

# Distribution of the overall rating per room type.
rooms %>%
  ggplot(aes(overall_satisfaction, factor(room_type))) +
  geom_density_ridges(
    scale = 5,
    fill = col[3],
    alpha = 0.7,
    color = "white"
  ) +
  scale_x_continuous(expand = c(0, 0)) +
  labs(y = "", x = "Rating")

Next, I exclude listings with fewer than three reviews, as it can be assumed that these listings have never been booked, or only very rarely.
# Keep listings with at least three reviews.
rooms <- rooms %>%
  filter(reviews >= 3)

# Bin review counts into unit-wide intervals up to 50 plus one overflow bin.
rooms$reviews.cut <- cut(rooms$reviews, c(seq(0, 50, 1), Inf))

# Distribution of the number of reviews per city (x axis is the bin index).
ggplot(rooms, aes(as.numeric(reviews.cut), factor(city))) +
  geom_density_ridges(
    scale = 5,
    fill = col[3],
    alpha = 0.7,
    color = "white"
  ) +
  scale_y_discrete(expand = c(0, 0)) +
  scale_x_continuous(
    expand = c(0, 0),
    breaks = c(seq(0, 50, 10)),
    labels = c(seq(0, 40, 10), ">50")
  ) +
  labs(y = "", x = "Number of Reviews")

# Working table for the text analysis: selected listing attributes plus
# `fulltext`, the listing name and description joined by a single space.
df <- rooms %>%
  select(
    room_id, name,
    description, city, price, overall_satisfaction,
    room_type, bed_type, pic_count,
    reviews, accommodates, bedrooms, minstay,
    latitude, longitude
  ) %>%
  mutate(fulltext = paste(name, description))

Turning to the text data, let's first have a quick look at three random descriptions:
# Render three randomly drawn listing descriptions as a left-aligned table.
rooms %>%
  sample_n(3) %>%
  select(description) %>%
  knitr::kable(align = "l")

| description |
|---|
| I offer a very cozy room with a double bed, a couch and a Desk, 10 Min from Central Station. Wifi is available Es ist ein hübsches Zimmer mit einem Doppelbett, einer Couch und einem Schreibtisch, 10 Minuten vom Hauptbahnhof. Wifi gibts auch |
| Unser 1-Zimmer Nichtraucher-Appartment mit ca. 28qm befindet sich in einem ruhigen Mehrfamilienhaus in Frankfurt Hausen unmittelbar der U-Bahn Haltestelle "Große Nelkenstraße". Ideal für Besucher sowie Geschäftsreisende, die eine schnelle Anbindung Richtung Innenstadt/Messe möchten. Abholung am Flughafen möglich; wir sind absolut zuverlässig.Das Appartment ist vollständig neu renoviert (Laminat/Fließen) und neu eingerichtet mit einem großen Doppelbett, Schreibtisch, Kochgelegenheit sowie selbstverständlich eigenes Bad/WC/Dusche. Selbstverständlich HD-TV sowie high-speed W-LAN (50.000) und ein gefüllter Kühlschrank inklusive. Mindestmietdauer 5 Tage. Endreinigung 49,- EUR Einkaufsgelegenheiten in fußläufiger Entfernung, Parkplätze problemlos.Sonderpreise für Aufenthalte von über einer Woche bis zu einem Monat (ausserhalb der Messezeiten) auf Anfrage.Absolut privat und abgeschlossen. |
| Altbau-Wohnung im Herzen Hamburgs. 50qm, max. 3 Personen. 1 Doppelbett - 210cm x 160cm PERFEKT FÜR LANGE LULATSCHE UND ALLE DIE PLATZ BRAUCHEN, 1 Wohnzimmercouch 160cm. Badezimmer mit Dusche, Küche mit Spülmaschine, geräumiges Wohnzimmer, Wifi. |
In which languages are the descriptions written?
# NOTE(review): presumably this restores a prepared `df` that already carries
# the detected `language` column -- confirm against the script that wrote it.
load(file = "../output/prep1.Rda")

# Number of listings per detected description language, as horizontal bars.
df %>%
  count(language) %>%
  ggplot(aes(x = reorder(language, n), y = n)) +
  geom_col(fill = col[3], alpha = 0.7) +
  coord_flip() +
  labs(x = "", y = "")

Check a sample of listings to see whether the classification is valid
# Render five randomly drawn texts next to their detected language.
df %>%
  sample_n(5) %>%
  select(fulltext, language) %>%
  knitr::kable()

| fulltext | language |
|---|---|
| Charmantes Zimmer im Herzen Schwabings Die Unterkunft befindet sich in einer der schönsten Ecken Schwabings, Restaurants, Cafes, Geschäfte direkt ums Eck, die Münchner Freiheit mit bester Verkehrsanbindung (U-Bahn, Straßenbahn, Bus) weniger als 5 Min zu Fuß entfernt. In die Innenstadt bzw. zum Marienplatz braucht man mit der U-Bahn nur ca. 10 Minuten, der Englische Garten ist auch nur wenige Minuten entfernt. Das Zimmer ist Teil einer schönen Altbauwohnung, deren Bad und Küche Du mit mir teilst. | german |
| Weddinger Paradies Die Ideale Unterkunft für alle alter! Liegt zentral in Weddingerkiez mit vielfälltige Einkaufs,Transport und vergnügen Gelegenheit. Neurenovierte altbau Wohnung mit genügend Fläche zum loslassen !! Die Wohnung liegt im Hochparterre des Hauses! | german |
| Belle Epoque Apartment in Berlin Welcome in one of the most autentic flat in Berlin. A small door will you bring back directly in the 20s years of the Berlin s “Belle epoque”. Old stripped pine floorboards, spacious high ceilings in traditional Berlin style, a big classic sofa of the 20s and a tester bed have been put together to create an unique atmosfeare of a forgotten time. The kitchen (with all things that are necessary for cooking and eating) and the bathroom (with a bathtub) have been also renewed following this style. | english |
| 3 Room Apartment central Berlin for family Zentral gelegene Wohnung, 5 min bis zum U-Bahnhof (Nauener Platz), sehr hell und perfekt für Familien eingerichtet. Wir haben eine 3 jährige und eine 1,5 jährige Tochter. This apartment is in central Berlin, close to the underground railway. We live here with our two children (3 years and 1.5 years) and everything is at hand such as the highchairs, the babybed, books, toys and so on. | english |
| Im Herzen Schwabings! Wohnen im ruhigen Teil des In-Viertels Schwabing! Unsere Wohnung liegt in unmittelbarer Nähe des Luitpold- & Olympiaparks. U-Bahn, Tram & Bus 200 m entfernt. In nur 5 Minuten erreicht man den Hauptbahnhof. Herzlich Willkommen in Schwabing! | middle_frisian |
OK, this looks mostly good (the last example above is German but was labelled Middle Frisian). Let's keep only listings with German and English descriptions.
# Restrict to listings whose text was classified as German or English.
df <- df %>%
  filter(language %in% c("german", "english"))

# Listings per city, with bars filled by description language.
df %>%
  ggplot(aes(x = factor(city))) +
  geom_bar(aes(fill = language), alpha = 0.8) +
  labs(x = "", y = "", fill = "")

It is not surprising that Berlin seems to be the most international city, measured by the listings that have their description in English. But I am a little disappointed with Hamburg…
How long are the descriptions on average?
# Word count per listing text: the number of whitespace-delimited tokens.
# lengths(regmatches()) yields 0 for a text without any token, whereas taking
# the length of the raw gregexpr() result counted such a text as one word.
df$text_length <- lengths(
  regmatches(df$fulltext, gregexpr("\\S+", df$fulltext))
)

# Bin word counts into unit-wide intervals up to 150 plus one overflow bin.
# The first bin is (0, 1], so a count of 0 falls outside the bins and is NA.
df$text_length.cut <- cut(df$text_length, c(seq(0, 150, 1), Inf))

# Word-count distribution per city, split by language. Breaks are given
# explicitly so that they always line up with the four manual labels; ggplot2
# raises an error when breaks and labels differ in length.
df %>%
  ggplot(aes(as.numeric(text_length.cut), factor(city))) +
  geom_density_ridges(
    aes(fill = language),
    color = "white",
    alpha = 0.8
  ) +
  scale_x_continuous(
    expand = c(0, 0),
    breaks = seq(0, 150, 50),
    labels = c(seq(0, 100, 50), ">150")
  ) +
  labs(y = "", x = "Word Count", fill = "")

Surprisingly, the English texts are longer.
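Next, I have to pre-process the text data to be able to include it in my model. Text data is inherently high-dimensional, so to reduce this dimensionality the following steps will be applied:

- replace punctuation, control characters and digits with spaces,
- convert the text to lower case,
- remove English and German stop words,
- split the text into single-word tokens and drop tokens with fewer than 2 or with 30 or more characters.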
# Replace every punctuation, control and digit character with a space.
df$text_cleaned <- gsub("[[:punct:][:cntrl:][:digit:]]", " ", df$fulltext)

# Trim leading and trailing whitespace. The replacement is the empty string;
# replacing with " " would leave a space behind instead of trimming.
df$text_cleaned <- gsub("^[[:space:]]+", "", df$text_cleaned)
df$text_cleaned <- gsub("[[:space:]]+$", "", df$text_cleaned)

# Lower-case first, because the stop-word lists are matched as given.
df$text_cleaned <- tolower(df$text_cleaned)

# Remove English and German stop words.
df$text_cleaned <- removeWords(df$text_cleaned, stopwords("english"))
df$text_cleaned <- removeWords(df$text_cleaned, stopwords("german"))

# One row per word; keep words with 2 to 29 characters.
token.df <- df %>%
  tidytext::unnest_tokens(word, text_cleaned) %>%
  filter(nchar(word) > 1, nchar(word) < 30)

# The 20 most frequent words (top_n keeps ties, so more rows are possible).
token.df %>%
  count(word, sort = TRUE) %>%
  ungroup() %>%
  top_n(20, n) %>%
  knitr::kable(align = "l")

| word | n |
|---|---|
| wohnung | 12264 |
| apartment | 9732 |
| zimmer | 8800 |
| room | 8529 |
| min | 8365 |
| berlin | 5994 |
| bahn | 5187 |
| restaurants | 4511 |
| minuten | 4289 |
| flat | 4200 |
| küche | 3877 |
| city | 3862 |
| nähe | 3800 |
| unterkunft | 3488 |
| bars | 3228 |
| qm | 3060 |
| direkt | 2992 |
| liegt | 2983 |
| station | 2955 |
| lage | 2916 |
# One row per pair of adjacent words in the cleaned text.
bigram.df <- tidytext::unnest_tokens(
  df,
  bigram,
  text_cleaned,
  token = "ngrams",
  n = 2
)

# The 20 most frequent bigrams (top_n keeps ties, so more rows are possible).
bigram.df %>%
  count(bigram, sort = TRUE) %>%
  ungroup() %>%
  top_n(20, n) %>%
  knitr::kable(align = "l")

| bigram | n |
|---|---|
| u bahn | 2699 |
| s bahn | 1870 |
| zimmer wohnung | 1497 |
| wohnung liegt | 1287 |
| prenzlauer berg | 1083 |
| living room | 1081 |
| city center | 989 |
| walking distance | 982 |
| unterkunft gut | 936 |
| bars restaurants | 891 |
| paare alleinreisende | 848 |
| gut paare | 832 |
| unterkunft nähe | 811 |
| restaurants bars | 786 |
| alleinreisende abenteurer | 771 |
| wohnung befindet | 751 |
| unmittelbarer nähe | 745 |
| unterkunft lieben | 733 |
| st pauli | 689 |
| lieben wegen | 678 |
# Build a corpus from the cleaned texts and attach each listing's city as a
# named document variable, so subsetting does not depend on an auto-generated
# variable name such as "docvar1".
corp <- corpus(df$text_cleaned)
docvars(corp, "city") <- df$city

# Colour palette for the word clouds.
col <- RColorBrewer::brewer.pal(10, "BrBG")

# Minimum word frequency for inclusion in each city's word cloud.
# NOTE(review): Dresden is in the analysis but has no word cloud here --
# confirm whether that omission is intended.
min_freq_by_city <- c(
  "Berlin" = 250,
  "Hamburg" = 200,
  "München" = 50,
  "Köln" = 50,
  "Frankfurt" = 50,
  "Stuttgart" = 50
)

# One word cloud per city, using the same dfm settings for each.
for (target_city in names(min_freq_by_city)) {
  c.plot <- corpus_subset(corp, city == target_city)
  c.plot <- dfm(
    c.plot,
    tolower = TRUE,
    remove_numbers = TRUE,
    remove = stopwords("SMART")
  )
  textplot_wordcloud(
    c.plot,
    min.freq = min_freq_by_city[[target_city]],
    color = col
  )
}